
clear all
set more off

* Root folder of the PatentsView project; every path below is built from it.
global dir /Volumes/Zihao_SSD2/PatentsView

*** This part cleans inventor names
*** Zihao Li. 06/2024
* Load the raw inventor-by-patent table and keep only the id, name and sequence columns used below.
import delimited $dir/rawdata/g_inventor_disambiguated.tsv, clear
keep patent_id inventor_id disambig_inventor_name_first disambig_inventor_name_last inventor_sequence
rename (disambig_inventor_name_first disambig_inventor_name_last) (first_name last_name)
* Rows without a first name are discarded.
* (A full `sort first_name` used to run here. Nothing before the later `sort inventor_name` /
*  `sort patent_id inventor_sequence` depends on row order, so that extra pass over the raw
*  table was dropped.)
drop if first_name == ""
format %30s inventor_id first_name last_name

* First-name cleanup
* Literal ".O slashed." and ".0." tokens both become a plain "O".
replace first_name = subinstr(subinstr(first_name, ".O slashed.", "O", .), ".0.", "O", .)
* Remove a leading "by " (only when the name starts with it).
replace first_name = substr(first_name, 4, .) if strpos(first_name, "by ") == 1
* Delete "-Ing " and then "-Ing. " wherever they occur (presumably title fragments such as "Dipl.-Ing." -- verify).
replace first_name = subinstr(subinstr(first_name, "-Ing ", "", .), "-Ing. ", "", .)

* Last-name cleanup: generational suffixes and transcription artifacts.
* Statement order matters: a longer variant (e.g. ", Jr.") is removed before the shorter one it
* contains (", Jr"), otherwise stray punctuation would be left behind.
* Unless a line is anchored with substr(), subinstr() removes the token wherever it occurs in the
* string, and all matches are case-sensitive.
* NOTE(review): short unanchored patterns such as ", X", ", V", ", I", " IV" and " II" also match
* when more text follows them, so a last name that legitimately contains one of them would be
* altered as well -- confirm this is acceptable for the data.

* Literal "slashed O" tokens
replace last_name = subinstr(last_name, ".O slashed.", "O", .)
replace last_name = subinstr(last_name, ".o slashed.", "o", .)
* "Jr" written in mixed case
replace last_name = subinstr(last_name, ", Jr.", "", .)
replace last_name = subinstr(last_name, ", Jr,", "", .)
replace last_name = subinstr(last_name, ", Jr", "", .)
replace last_name = subinstr(last_name, ". Jr.", "", .)
replace last_name = subinstr(last_name, ". Jr", "", .)
replace last_name = subinstr(last_name, " Jr.", "", .)
* A bare " Jr" is only cut when it is the last three characters of the name
replace last_name = substr(last_name, 1, length(last_name)-3) if substr(last_name, -3, .) == " Jr"
replace last_name = subinstr(last_name, "-Jr.", "", .)
* "JR" written in upper case
replace last_name = subinstr(last_name, ", JR.", "", .)
replace last_name = subinstr(last_name, ", JR,", "", .)
replace last_name = subinstr(last_name, ". JR.", "", .)
replace last_name = subinstr(last_name, ". JR", "", .)
replace last_name = subinstr(last_name, ", JR", "", .)
replace last_name = subinstr(last_name, " JR.", "", .)
* Roman numerals, from the longest to the shortest so that e.g. ", VIII" is gone before ", V"
replace last_name = subinstr(last_name, ", X", "", .)
replace last_name = subinstr(last_name, ", VIII", "", .)
replace last_name = subinstr(last_name, ", V.", "", .)
replace last_name = subinstr(last_name, ", V", "", .)
* A bare " V" is only cut when it is the last two characters of the name
replace last_name = substr(last_name, 1, length(last_name)-2) if substr(last_name, -2, .) == " V"
replace last_name = subinstr(last_name, ", IV.", "", .)
replace last_name = subinstr(last_name, ", IV", "", .)
replace last_name = subinstr(last_name, " IV", "", .)
replace last_name = subinstr(last_name, ", III.", "", .)
replace last_name = subinstr(last_name, ", III", "", .)
replace last_name = subinstr(last_name, ",III", "", .)
replace last_name = subinstr(last_name, " III", "", .)
replace last_name = subinstr(last_name, ", II.", "", .)
replace last_name = subinstr(last_name, ", II", "", .)
replace last_name = subinstr(last_name, ",II", "", .)
replace last_name = subinstr(last_name, " II", "", .)
replace last_name = subinstr(last_name, ", I.", "", .)
replace last_name = subinstr(last_name, ", I", "", .)
* "Sr" and ordinal suffixes
replace last_name = subinstr(last_name, ", Sr.", "", .)
replace last_name = subinstr(last_name, ", SR.", "", .)
replace last_name = subinstr(last_name, ", 2nd", "", .)
* A bare " 3" is only cut when it is the last two characters of the name
replace last_name = substr(last_name, 1, length(last_name)-2) if substr(last_name, -2, .) == " 3"
replace last_name = subinstr(last_name, ", 3rd", "", .)
replace last_name = subinstr(last_name, " 3rd", "", .)
replace last_name = subinstr(last_name, ", 3", "", .)
replace last_name = subinstr(last_name, ", 5th", "", .)
* ".0." is deleted outright; a zero followed by an apostrophe becomes the letter O (e.g. 0'Brien -> O'Brien)
replace last_name = subinstr(last_name, ".0.", "", .)
replace last_name = subinstr(last_name, "0'", "O'", .)

* Remove legal-status / estate annotations that follow the last name after ", ".
* The tags are processed in exactly this order: a longer tag must be handled before a shorter one
* that is a prefix of it (e.g. "heir-at-law" before "heir", "administrators" before "administrator").
foreach tag in ///
	"deceased" "heiress" "heir-at-law" "heir at Law" "heir" ///
	"sole heir" "legal heir" "all legar heirs" "all legal heirs" "legal authorized heir" ///
	"executor" "Executor" "co-executor" "executrix" "Executrix" "co-executrix" ///
	"administrators" "administrator DBN CTA" "administrator" ///
	"Administrators" "Administrator" "administratrix" "Administratrix" ///
	"legatee" "trustee" "co-successor trustee" ///
	"representative" "personal representative" "joint personal representative" ///
	"co-personal representative" "legal representative" "surviving spouse" ///
	"Assignee of the Entire Estate of Ralph D. Wichman" {
	replace last_name = subinstr(last_name, ", `tag'", "", .)
}

* One-off repairs: drop the comma in front of "nee Rudy" and fix the "2imura" typo.
replace last_name = subinstr(last_name, ", nee Rudy", " nee Rudy", .)
replace last_name = subinstr(last_name, "2imura", "Imura", .)

* we first concat first_name and last_name, because first_name is actually a combination of first and middle names
gen inventor_name = first_name + " " + last_name

* clean special characters
* Punctuation and brackets are deleted outright. "/ " is removed before "/" so that the
* slash-plus-space form does not leave a double space behind.
replace inventor_name = subinstr(inventor_name, "?", "", .)
replace inventor_name = subinstr(inventor_name, "!", "", .)
replace inventor_name = subinstr(inventor_name, "(", "", .)
replace inventor_name = subinstr(inventor_name, ")", "", .)
replace inventor_name = subinstr(inventor_name, "{", "", .)
replace inventor_name = subinstr(inventor_name, "}", "", .)
replace inventor_name = subinstr(inventor_name, "[", "", .)
replace inventor_name = subinstr(inventor_name, "]", "", .)
replace inventor_name = subinstr(inventor_name, "/ ", "", .)
replace inventor_name = subinstr(inventor_name, "/", "", .)
replace inventor_name = subinstr(inventor_name, "'", "", .)
* The pattern on the next line is a single double-quote character, written with compound quotes.
replace inventor_name = subinstr(inventor_name, `"""', "", .)
replace inventor_name = subinstr(inventor_name, ";", "", .)
* Stray symbols (these look like the second half of mis-decoded two-byte characters -- verify)
replace inventor_name = subinstr(inventor_name, "¸", "", .)
replace inventor_name = subinstr(inventor_name, "¥", "", .)
replace inventor_name = subinstr(inventor_name, "¶", "", .)
replace inventor_name = subinstr(inventor_name, "¨", "", .)
replace inventor_name = subinstr(inventor_name, "¤", "", .)
replace inventor_name = subinstr(inventor_name, "±", "", .)
replace inventor_name = subinstr(inventor_name, "&", "", .)
replace inventor_name = subinstr(inventor_name, "§a", "", .)
* NOTE: a rule that replaced every capital "A" with "Anel" used to follow here. As written it
* rewrote every name containing an "A" (e.g. "Adam" -> "Aneldam"), so it has been removed.
* If it was meant to repair one specific garbled name, re-add it with the complete garbled
* token as the search pattern rather than the bare letter.

* Fold accented and mis-encoded variants of "a" / "A" to plain ASCII ("æ" becomes "ae").
* Lower-case forms
replace inventor_name = subinstr(inventor_name, "å", "a", .)
replace inventor_name = subinstr(inventor_name, "á", "a", .)
replace inventor_name = subinstr(inventor_name, "à", "a", .)
replace inventor_name = subinstr(inventor_name, "ä", "a", .)
replace inventor_name = subinstr(inventor_name, "æ", "ae", .)
replace inventor_name = subinstr(inventor_name, "ã", "a", .)
replace inventor_name = subinstr(inventor_name, "â", "a", .)
replace inventor_name = subinstr(inventor_name, "a¹", "a", .)
* Upper-case forms. The two-character "Ä°" is handled before the bare "Ä" so the "°" is not left behind.
replace inventor_name = subinstr(inventor_name, "Á", "A", .)
replace inventor_name = subinstr(inventor_name, "Å", "A", .)
replace inventor_name = subinstr(inventor_name, "Ä°", "A", .)
replace inventor_name = subinstr(inventor_name, "Ä", "A", .)
replace inventor_name = subinstr(inventor_name, "Ã", "A", .)
replace inventor_name = subinstr(inventor_name, "Ā", "A", .)
replace inventor_name = subinstr(inventor_name, "Ã", "A", .)
replace inventor_name = subinstr(inventor_name, "Â", "A", .)
* NOTE(review): every "Ã" has already been turned into "A" above, so the "Ã³" and "Ã¼" rules below
* look unreachable (assuming the same code point is used); the "A³" / "A¼" rules then clean up
* what the earlier replacement left. "Ã³" and "Ã¼" look like mis-decoded "ó" and "ü" -- confirm
* that mapping them to "A" is really intended.
replace inventor_name = subinstr(inventor_name, "Ã³", "A", .)
replace inventor_name = subinstr(inventor_name, "A³", "A", .)
replace inventor_name = subinstr(inventor_name, "Ã¼", "A", .)
replace inventor_name = subinstr(inventor_name, "A¼", "A", .)


* German sharp s: the ASCII transliteration is "ss" (it only looks like a "b"), e.g. "Weiß" -> "Weiss".
replace inventor_name = subinstr(inventor_name, "ß", "ss", .)

* Fold "c" / "C" variants to plain ASCII; the "¢" and "©" symbols are also treated as "c".
* NOTE(review): "Ç" is listed twice; the two literals may differ in their underlying encoding
* (precomposed vs. combining form) -- keep both unless confirmed identical.
replace inventor_name = subinstr(inventor_name, "č", "c", .)
replace inventor_name = subinstr(inventor_name, "ć", "c", .)
replace inventor_name = subinstr(inventor_name, "c̆", "c", .)
replace inventor_name = subinstr(inventor_name, "Ç", "C", .)
replace inventor_name = subinstr(inventor_name, "ç", "c", .)
replace inventor_name = subinstr(inventor_name, "¢", "c", .)
replace inventor_name = subinstr(inventor_name, "Č", "C", .)
replace inventor_name = subinstr(inventor_name, "©", "c", .)
replace inventor_name = subinstr(inventor_name, "Ç", "C", .)

* Fold accented "e" / "E" variants to plain ASCII, upper case first, then lower case.
replace inventor_name = subinstr(inventor_name, "É", "E", .)
replace inventor_name = subinstr(inventor_name, "È", "E", .)
replace inventor_name = subinstr(inventor_name, "é", "e", .)
replace inventor_name = subinstr(inventor_name, "è", "e", .)
replace inventor_name = subinstr(inventor_name, "ě", "e", .)
replace inventor_name = subinstr(inventor_name, "ë", "e", .)
replace inventor_name = subinstr(inventor_name, "ȩ", "e", .)

* Fold "g" variants to plain ASCII. Keep this order: once the single accented letters have been
* replaced, the later rules remove a "g" that is still followed by a leftover accent mark.
replace inventor_name = subinstr(inventor_name, "ǧ", "g", .)
replace inventor_name = subinstr(inventor_name, "ğ", "g", .)
replace inventor_name = subinstr(inventor_name, "ǧ̃", "g", .)
replace inventor_name = subinstr(inventor_name, "g̃", "g", .)
replace inventor_name = subinstr(inventor_name, "gˇ", "g", .)

* Fold "i" / "I" variants to plain ASCII; the inverted exclamation mark "¡" is also treated as "i".
* Upper-case forms
replace inventor_name = subinstr(inventor_name, "İ", "I", .)
replace inventor_name = subinstr(inventor_name, "Î", "I", .)
replace inventor_name = subinstr(inventor_name, "Í", "I", .)
replace inventor_name = subinstr(inventor_name, "Ì", "I", .)
* Lower-case forms. Sequences made of a dotless "ı" plus an accent mark are handled before the
* bare "ı" rule further down, so the accent mark is removed together with the letter.
replace inventor_name = subinstr(inventor_name, "i̇", "i", .)
replace inventor_name = subinstr(inventor_name, "ı̈", "i", .)
replace inventor_name = subinstr(inventor_name, "ï", "i", .)
replace inventor_name = subinstr(inventor_name, "ı¨", "i", .)
replace inventor_name = subinstr(inventor_name, "ı́", "i", .)
replace inventor_name = subinstr(inventor_name, "í", "i", .)
replace inventor_name = subinstr(inventor_name, "ĭ", "i", .)
replace inventor_name = subinstr(inventor_name, "iˇ", "i", .)
replace inventor_name = subinstr(inventor_name, "iˆ", "i", .)
replace inventor_name = subinstr(inventor_name, "ı", "i", .)
replace inventor_name = subinstr(inventor_name, "î", "i", .)
replace inventor_name = subinstr(inventor_name, "¡", "i", .)

* Fold "L" / "l" variants to plain ASCII.
replace inventor_name = subinstr(inventor_name, "Ł", "L", .)
replace inventor_name = subinstr(inventor_name, "Ľ", "L", .)
replace inventor_name = subinstr(inventor_name, "ł", "l", .)

* Fold "N" / "n" variants to plain ASCII.
replace inventor_name = subinstr(inventor_name, "Ñ", "N", .)
replace inventor_name = subinstr(inventor_name, "ń", "n", .)
replace inventor_name = subinstr(inventor_name, "ñ", "n", .)

* Fold "O" / "o" variants to plain ASCII ("œ" becomes "oe").
* NOTE(review): "Ó" is listed twice; the two literals may differ in their underlying encoding
* (precomposed vs. combining form) -- keep both unless confirmed identical.
replace inventor_name = subinstr(inventor_name, "Ø", "O", .)
replace inventor_name = subinstr(inventor_name, "Ó", "O", .)
replace inventor_name = subinstr(inventor_name, "Ö", "O", .)
replace inventor_name = subinstr(inventor_name, "Ò", "O", .)
replace inventor_name = subinstr(inventor_name, "Ō", "O", .)
replace inventor_name = subinstr(inventor_name, "ö", "o", .)
replace inventor_name = subinstr(inventor_name, "Ó", "O", .)
replace inventor_name = subinstr(inventor_name, "ó", "o", .)
replace inventor_name = subinstr(inventor_name, "ò", "o", .)
replace inventor_name = subinstr(inventor_name, "ø", "o", .)
replace inventor_name = subinstr(inventor_name, "ô", "o", .)
replace inventor_name = subinstr(inventor_name, "ő", "o", .)
replace inventor_name = subinstr(inventor_name, "œ", "oe", .)

* Fold the accented "r" to plain ASCII.
replace inventor_name = subinstr(inventor_name, "ř", "r", .)

* Fold "S" / "s" variants to plain ASCII.
replace inventor_name = subinstr(inventor_name, "š", "s", .)
replace inventor_name = subinstr(inventor_name, "ş", "s", .)
replace inventor_name = subinstr(inventor_name, "Š", "S", .)
replace inventor_name = subinstr(inventor_name, "Ş", "S", .)
replace inventor_name = subinstr(inventor_name, "Ś", "S", .)

* Fold "U" / "u" variants to plain ASCII.
* NOTE(review): "ü" is listed twice; the two literals may differ in their underlying encoding
* (precomposed vs. combining form) -- keep both unless confirmed identical.
replace inventor_name = subinstr(inventor_name, "ü", "u", .)
replace inventor_name = subinstr(inventor_name, "ú", "u", .)
replace inventor_name = subinstr(inventor_name, "ü", "u", .)
replace inventor_name = subinstr(inventor_name, "Ü", "U", .)
replace inventor_name = subinstr(inventor_name, "Ú", "U", .)

* Fold the accented "y" to plain ASCII.
replace inventor_name = subinstr(inventor_name, "ý", "y", .)

* Fold "Z" / "z" variants to plain ASCII.
replace inventor_name = subinstr(inventor_name, "Ž", "Z", .)
replace inventor_name = subinstr(inventor_name, "ž", "z", .)

* One-off repair: a leading digit 3 typed in place of "E" in "Eric " (trailing space is part of the pattern).
replace inventor_name = subinstr(inventor_name, "3ric ", "Eric ", .)


** Extract inventor FirstName
* Trim both ends, then break the full name into one variable per space-separated token
* (inventor_name1, inventor_name2, ...).
replace inventor_name = strtrim(inventor_name)
split inventor_name, parse(" ")
* split reports how many token variables it created and their names. Using these instead of a
* hard-coded 11 means the code neither stops with an error when fewer tokens exist nor silently
* ignores (and leaves behind) tokens beyond the 11th.
local ntok = r(nvars)
local tokvars `r(varlist)'

* Add spaces after dots if they aren't there already. Also add dots to middle initials
forval y = 1/`ntok' {
	replace inventor_name`y' = subinstr(inventor_name`y', ".", ". ", .)
	replace inventor_name`y' = strrtrim(inventor_name`y')
	* a token that is exactly one upper-case letter is treated as an initial and gets a dot
	replace inventor_name`y' = inventor_name`y' + "." if ustrregexm(inventor_name`y', "[A-Z]$") & ustrlen(inventor_name`y') == 1
}

* Reshape into a full_name format: glue the non-empty tokens back together with single spaces
sort inventor_name
replace inventor_name = inventor_name1
forval y = 2/`ntok' {
	replace inventor_name = inventor_name + " " + inventor_name`y' if !missing(inventor_name`y')
}
drop `tokvars'

* Title-case the name and remove a leading ". " if one is left at the start
replace inventor_name = proper(inventor_name)
replace inventor_name = substr(inventor_name, 3, .) if substr(inventor_name, 1, 2) == ". "

* Change x. xxx xxx into xxx x. xxx
* Split the normalized name again; the token count is taken from split itself rather than
* assumed to be 11, so shorter or longer names are handled without errors or lost tokens.
split inventor_name, parse(" ")
local ntok = r(nvars)
local tokvars `r(varlist)'
gen inventor_namenew1 = inventor_name1

* When the name starts with an initial ("X.") followed by a real word (not another initial and
* not a particle such as de/van/von/la) and a third token exists, the second token is used as the
* first name and the initial is moved into second position.
* The swap needs inventor_name2 and inventor_name3, so it is skipped if split produced fewer
* than three token variables.
if `ntok' >= 3 {
	replace inventor_namenew1 = inventor_name2 if ustrregexm(inventor_name1, "^[A-Z]\.")==1 & ustrregexm(inventor_name2, "[A-Za-z]\.") == 0 & !inlist(lower(inventor_name2), "de", "van", "von", "la") & inventor_name3 != ""
	replace inventor_name2 = inventor_name1 if inventor_name1 != inventor_namenew1
}

* Rebuild the full name from the (possibly swapped) tokens
gen inventor_namenew = inventor_namenew1
forval y = 2/`ntok' {
	replace inventor_namenew = inventor_namenew + " " + inventor_name`y' if !missing(inventor_name`y')
}

order patent_id inventor_name inventor_namenew inventor_namenew1
drop inventor_name
rename inventor_namenew inventor_name

* Generate last name (last non-missing entry): each later non-empty token overwrites the earlier one
gen last_name_clean = ""
forval y = 1/`ntok' {
	replace last_name_clean = inventor_name`y' if !missing(inventor_name`y')
}

* Generate first name
rename inventor_namenew1 first_name_clean
format %30s inventor_name

drop `tokvars'

order patent_id inventor_name first_name first_name_clean last_name last_name_clean
* Remove a single leading hyphen from each of the derived name fields
foreach v of varlist inventor_name first_name_clean last_name_clean {
	replace `v' = substr(`v', 2, .) if substr(`v', 1, 1) == "-"
}

* Individual fixes for last names
* When the derived last name is only two characters long (e.g. an initial plus a dot) and the
* original last_name field contains one of the surnames below, use that surname instead.
* The list is processed in this exact order: once a row has been fixed its last_name_clean is no
* longer two characters long, so an earlier entry wins over a later one. Longer names are
* therefore listed before the shorter names they contain (Marino/Marin, Krishnan/Krishna,
* Narayanan/Narayan, Rajadurai/Raj, Sundaram/Sundar).
* Fix: the "Aldrey" entry used to write the misspelt value "Aldrewy"; it now writes the same
* surname it searches for, like every other entry.
local surname_fixes ///
	Ashok Ayyagari Aldrey Acharya Marino Kumar Vargis Gonzalez Bueno ///
	Bahadur Babu Balaji Basha Bayona Bhargava Bhushan Bernal ///
	Chandagalu Calderon Cespedes Chacin Chandra Chandar Cuevas ///
	Dam David James Robert Esquivel Figueroa Fischmann ///
	Gadey Govindillam Gutierrez Herrera Hoefken Jimenez ///
	Khan Krishnan Krishna Laya Layrisse Lenchig ///
	Marin Marvin Mendez Mohaideen Mohan Murthy ///
	Nagy Nampoothiri Narayanan Narayan Nayaka Njiende Nrusimhan ///
	Pena Perez Pasha Plank Pai Petromanolakis Prabhu Prabu Prakash Prasad ///
	Rahal Rahul Rajadurai Raj Rao Raman Reddy Ryan ///
	Sarma Sait Salazar Setty Sevilla Shankar Shetty Sreedhar Sucre Sundaram Sundar ///
	Traverso Vadasz Vincenzo Ynclino Yu Zambrano

foreach nm of local surname_fixes {
	replace last_name_clean = "`nm'" if strpos(last_name, "`nm'") > 0 & length(last_name_clean) == 2
}

* Name without middle parts: cleaned first token plus cleaned last token
gen inventor_name_nomid = first_name_clean + " " + last_name_clean
order inventor_name_nomid inventor_sequence, after(inventor_name)
sort patent_id inventor_sequence

* Export dataset
save "$dir/temp/g_inventor_clean.dta", replace
export delimited using "$dir/cleandata/g_inventor_clean.csv", replace

* Export firstname to genderize
* One row per distinct lower-cased first name.
preserve
	keep first_name_clean
	sort first_name_clean
	replace first_name_clean = lower(first_name_clean)
	* only one variable is left, so dropping fully duplicated rows de-duplicates on it
	duplicates drop
	export delimited using "$dir/temp/g_inventor_clean_firstname.csv", replace
restore

* Export inventor_name_nomid to racialize (because ethnicolr only takes in firstname and lastname)
* One row per distinct inventor_name_nomid; the two name parts are written in lower case.
preserve
	duplicates drop inventor_name_nomid, force
	keep inventor_name_nomid first_name_clean last_name_clean
	foreach part of varlist first_name_clean last_name_clean {
		replace `part' = lower(`part')
	}
	export delimited using "$dir/temp/g_inventor_clean_nomidname.csv", replace
restore




*************** Second Part: merges inventors' gender and race **************************
global dir /Volumes/Zihao_SSD2/PatentsView

* First-name -> gender lookup. An empty gender is recoded as "ambiguous" and the score
* columns get a gender_ prefix before the file is stored for the merge further below.
import delimited "$dir/temp/g_inventor_firstname_gendered_r3.csv", clear
drop name_cleaned name
replace gender = "ambiguous" if gender == ""
rename round gender_round
rename count gender_count
rename probability gender_prob
save "$dir/temp/g_inventor_gender.dta", replace

* Name -> race predictions. The four probability columns get a _prob suffix.
import delimited "$dir/temp/g_inventor_race.csv", clear
foreach grp in asian hispanic nh_white nh_black {
	rename `grp' `grp'_prob
}
* race90 / race80 / race70: copy of race, set to "ambiguous" when all four probabilities are
* below 0.90 / 0.80 / 0.70 respectively.
foreach pct in 90 80 70 {
	gen race`pct' = race
	replace race`pct' = "ambiguous" if asian_prob<0.`pct' & hispanic_prob<0.`pct' & nh_white_prob<0.`pct' & nh_black_prob<0.`pct'
}

* Attach the cleaned inventor rows to the race predictions; only matched rows are kept.
merge 1:m inventor_name_nomid using "$dir/temp/g_inventor_clean.dta" // 20,409,866
keep if _merge == 3
drop _merge
format %20s first_name_clean last_name_clean first_name last_name
sort patent_id inventor_name

* Attach the first-name gender lookup; again only matched rows are kept,
* and rows without a race label are discarded afterwards.
merge m:1 first_name_clean using "$dir/temp/g_inventor_gender.dta" // 20,409,655
keep if _merge == 3
drop _merge
drop if race == ""

* Different thresholds for gender
* "gender" variable is raw output from Genderize.io
* gender_0P_C is a copy of gender that is set to "ambiguous" when gender_prob < 0.P or, for
* C = 50 / 100, when gender_count < C (the _0 variants apply the probability cut-off only).
* gender_io_0P_100 is a copy of gender_0P_100 taken at this point.
foreach p in 9 8 7 6 5 {
	gen gender_0`p'_0 = gender
	replace gender_0`p'_0 = "ambiguous" if gender_prob<0.`p'
	foreach c in 50 100 {
		gen gender_0`p'_`c' = gender
		replace gender_0`p'_`c' = "ambiguous" if gender_prob<0.`p' | gender_count<`c'
	}
	gen gender_io_0`p'_100 = gender_0`p'_100
}

order patent_id inventor_name inventor_sequence first_name_clean last_name_clean gender race gender_09_100 gender_io_09_100 race90 race80 race70 gender_prob gender_count
sort patent_id inventor_sequence

save "$dir/temp/g_inventor_gender_race_temp.dta", replace


* Raw data from PatentsView
* Pull male_flag per inventor_id (missing flags are coded as -1, first row per inventor is kept)
* and attach it to the gender/race file; only matched rows are kept.
import delimited "$dir/rawdata/g_inventor_disambiguated.tsv", varnames(1) clear
keep inventor_id male_flag
replace male_flag = -1 if male_flag == .
duplicates drop inventor_id, force

merge 1:m inventor_id using "$dir/temp/g_inventor_gender_race_temp.dta"
keep if _merge == 3
drop _merge
format %30s inventor_id

* Gender variables
* Wherever a gender variable is still "ambiguous", fill it from male_flag (1 -> "male",
* 0 -> "female"). This covers gender and every gender_0P_C variable; the gender_io_* copies are
* not modified here.
local fillvars gender
foreach p in 9 8 7 6 5 {
	foreach c in 0 50 100 {
		local fillvars `fillvars' gender_0`p'_`c'
	}
}
foreach v of local fillvars {
	replace `v' = "male" if male_flag == 1 & `v' == "ambiguous"
	replace `v' = "female" if male_flag == 0 & `v' == "ambiguous"
}

order patent_id inventor_name inventor_sequence first_name_clean last_name_clean gender_09_100 gender_io_09_100 race90
sort patent_id inventor_sequence

save "$dir/temp/g_inventor_gender_race.dta", replace


* Raw data from Kaltenberg et al. (2023)
* Drop the source-specific birth-year columns and prob_male_min, tag the remaining descriptive
* columns with _kjl, and keep one row per 2018 inventor id.
import delimited "$dir/rawdata/inventor_age_score_gender.csv", varnames(1) clear
drop radaris_birthyear spokeo_birthyear been_birthyear peoplefinders_birthyear prob_male_min
foreach part in city state first_name last_name {
	rename inventor_`part' `part'_kjl
}
format %20s city_kjl state_kjl first_name_kjl last_name_kjl
rename inventor_id inventor_id_20180528
rename gender gender_kjl
duplicates drop inventor_id_20180528, force
save "$dir/temp/inventor_age_score_gender.dta", replace

* This file links past inventor_id (20180528) to current one (20220929)
import delimited $dir/rawdata/g_persistent_inventor.tsv, varnames(1) clear
rename disamb_inventor* inventor*
keep inventor_id_20180528 inventor_id_20220929
rename inventor_id_20220929 inventor_id

* The 1:m merge below needs exactly one row per current inventor_id.
* A plain "duplicates drop inventor_id, force" keeps whichever row happens to
* come first; if that row has no 20180528 id, the inventor can never be matched
* to the age/gender file even though another row carries a usable id.
* (Presumably rows without a 2018 id exist for records added after that
* release -- TODO confirm against the raw file.)
* Instead, sort rows with a non-missing 2018 id last within inventor_id and keep
* the last row, which also makes the selection reproducible across runs.
tempvar has_old_id
gen byte `has_old_id' = !missing(inventor_id_20180528)
bysort inventor_id (`has_old_id' inventor_id_20180528): keep if _n == _N
* Drop explicitly so the helper cannot leak into the merged / saved data
drop `has_old_id'

* Attach the crosswalk to the patent-inventor records; keep matched rows only
merge 1:m inventor_id using $dir/temp/g_inventor_gender_race.dta // 20,407,160
drop if _merge != 3
drop _merge
format %30s inventor_id

* Attach age/gender via the 2018 id; inventors without a match are kept,
* rows that exist only in the age/gender file are discarded
merge m:1 inventor_id_20180528 using $dir/temp/inventor_age_score_gender.dta // 6,232,741
drop if _merge == 2
drop _merge

order patent_id inventor_name inventor_sequence first_name_clean last_name_clean birthyear gender_09_100 gender_io_09_100 race80 gender_kjl male_flag
sort patent_id inventor_sequence

* Use gender_kjl to complement Genderize.io and male_flag:
* wherever a gender column is still "ambiguous", take "male" when
* gender_kjl == "M" and "female" when gender_kjl == "F".
* Columns covered: gender plus gender_<09..05>_<0|50|100>
* (the gender_io_* columns are not touched).
local kjl_cols gender
foreach lvl in 09 08 07 06 05 {
    foreach cut in 0 50 100 {
        local kjl_cols `kjl_cols' gender_`lvl'_`cut'
    }
}

foreach gcol of local kjl_cols {
    * The two conditions are mutually exclusive, so their order is irrelevant
    replace `gcol' = "male"   if `gcol' == "ambiguous" & gender_kjl == "M"
    replace `gcol' = "female" if `gcol' == "ambiguous" & gender_kjl == "F"
}

* Gender indicator based only on male_flag and Kaltenberg et al. (2023)
* 1 is male, 0 is female, missing when neither source gives an answer.
* When the two sources disagree, the female coding (0) takes precedence.
gen gender_ind = cond(gender_kjl == "F" | male_flag == 0, 0, ///
                 cond(gender_kjl == "M" | male_flag == 1, 1, .))

* Number of inventors on each patent (rows remaining for that patent_id)
bys patent_id: gen num_inventors = _N

* Number of male/female inventors per patent, based on gender_09_100.
* Inventors whose gender_09_100 is neither "male" nor "female" are in neither count.
* total() is the documented egen function; sum() is its legacy name and is
* easily confused with the running-sum function sum() used by generate.
bys patent_id: egen num_inventors_m = total(gender_09_100 == "male")
bys patent_id: egen num_inventors_f = total(gender_09_100 == "female")


*** Gender of the lead inventor
* Each lead_* column starts as a copy of the inventor's own value; within a
* patent, every row with inventor_sequence != 0 then takes the value of the
* row above it, so the value of the inventor_sequence == 0 row is carried down
* the whole patent. This depends on rows being ordered by inventor_sequence
* within patent_id.
sort patent_id inventor_sequence

* Overall gender and its companion columns
local lead_base gender gender_prob gender_count gender_ind
foreach src of local lead_base {
    gen lead_`src' = `src'
}
* Carried down in the same order as before: ind, prob, count (lead_gender itself
* is propagated together with the suffixed columns below)
foreach src in gender_ind gender_prob gender_count {
    by patent_id: replace lead_`src' = lead_`src'[_n-1] if inventor_sequence != 0
}

* Suffixed gender columns: gender_<lvl>_<0|50|100> and gender_io_<lvl>_100
local lead_sfx
foreach lvl in 09 08 07 06 05 {
    local lead_sfx `lead_sfx' gender_`lvl'_0 gender_`lvl'_50 gender_`lvl'_100 gender_io_`lvl'_100
}
foreach src of local lead_sfx {
    gen lead_`src' = `src'
}

foreach src in gender `lead_sfx' {
    by patent_id: replace lead_`src' = lead_`src'[_n-1] if inventor_sequence != 0
}



* sanity check of lead_gender: within each patent the first and last rows must
* carry the same lead value for every lead_gender* column
local lead_chk lead_gender_prob lead_gender_count lead_gender_ind lead_gender
foreach lvl in 09 08 07 06 05 {
    local lead_chk `lead_chk' lead_gender_`lvl'_0 lead_gender_`lvl'_50 lead_gender_`lvl'_100 lead_gender_io_`lvl'_100
}

foreach lv of local lead_chk {
    by patent_id: assert (`lv'[1] == `lv'[_N])
}


*** Race of lead inventor
* Same carry-down scheme as for gender: copy each race column, then let every
* row with inventor_sequence != 0 inherit the value of the row above it.
sort patent_id inventor_sequence

local race_cols race race90 race80 race70
foreach rc of local race_cols {
    gen lead_`rc' = `rc'
}
foreach rc of local race_cols {
    by patent_id: replace lead_`rc' = lead_`rc'[_n-1] if inventor_sequence != 0
}

* sanity check of lead race: first and last rows of a patent must agree
foreach rc of local race_cols {
    by patent_id: assert (lead_`rc'[1] == lead_`rc'[_N])
}


*** Generate birthyear of lead/oldest/youngest/average inventor
sort patent_id inventor_sequence

* Lead inventor: value of the inventor_sequence == 0 row, carried down the patent
gen lead_birthyear = birthyear
by patent_id: replace lead_birthyear = lead_birthyear[_n-1] if inventor_sequence != 0

* Team aggregates: the oldest inventor has the smallest birth year,
* the youngest the largest
by patent_id: egen oldest_birthyear   = min(birthyear)
by patent_id: egen youngest_birthyear = max(birthyear)
by patent_id: egen avg_birthyear      = mean(birthyear)

* sanity check of birthyears of inventors: constant within patent
foreach who in lead oldest youngest avg {
    by patent_id: assert (`who'_birthyear[1] == `who'_birthyear[_N])
}


*** Generate firstpub_year: for each inventor, the earliest patent_year among
*** that inventor's patents (used below for the experience variables)
* Only rows found in both datasets are kept
merge m:1 patent_id using $dir/temp/patid_year.dta, keep(match) nogenerate // 20,407,160

sort inventor_id patent_year
by inventor_id: assert (inventor_name[1]==inventor_name[_N])

* Rows are ordered by patent_year within inventor, so the first row of each
* inventor holds that inventor's earliest year
by inventor_id: gen firstpub_year = patent_year[1]

* sanity checks
by inventor_id: assert (firstpub_year[1]==firstpub_year[_N])
assert firstpub_year <= patent_year
by inventor_id: assert firstpub_year == patent_year if _n == 1

*** Generate patent-level "experience" variables
* Experience = patent_year minus the inventor's firstpub_year
sort patent_id inventor_sequence

* Experience of lead/oldest/youngest/average inventor
* Lead: computed on the inventor_sequence == 0 row and carried down the patent
gen lead_experience = patent_year - firstpub_year if inventor_sequence == 0
by patent_id: replace lead_experience = lead_experience[_n-1] if inventor_sequence != 0

* "oldest" = longest experience on the patent, "youngest" = shortest
by patent_id: egen oldest_experience   = max(patent_year - firstpub_year)
by patent_id: egen youngest_experience = min(patent_year - firstpub_year)
by patent_id: egen avg_experience      = mean(patent_year - firstpub_year)

* sanity check of experiences of inventors: constant within patent
foreach who in lead oldest youngest avg {
    by patent_id: assert (`who'_experience[1] == `who'_experience[_N])
}

* Attach patent-level assignee information; patents without an assignee record
* are kept, assignee-only rows are not
merge m:1 patent_id using $dir/temp/g_assignee_patentlevel.dta, keep(master match) nogenerate // 19,186,306 

* Assignee type and location details are not carried into the final file
drop assignee_type assignee_state assignee_city assignee_location_id ///
     assignee_latitude assignee_longitude assignee_state_fips

* Final patent-inventor dataset, written as both .dta and .csv
save $dir/cleandata/g_inventor_gender_race_age.dta, replace
export delimited using $dir/cleandata/g_inventor_gender_race_age.csv, replace

